Geometry Operations and Spatial Analysis
Contents
Geometry Operations and Spatial Analysis#
Setup#
from sqlalchemy import create_engine
import geopandas as gpd
import pandas as pd
import requests
import shutil
import folium
import json
from pathlib import Path
engine_str = "postgresql+psycopg2://docker:docker@0.0.0.0:25432/restaurants"
engine = create_engine(engine_str)
%load_ext sql
%sql $engine.url
'Connected: docker@restaurants'
QUEENS = [40.7292, -73.9049]
def to_map(result_set, location=QUEENS, geom_col='geom', epsg=4326, zoom=13, fillcolor='orange'):
m = folium.Map(location=location, zoom_start=zoom)
df = result_set.DataFrame()
gdf = gpd.GeoDataFrame(df, geometry=gpd.GeoSeries.from_wkb(df[geom_col], crs=epsg))
if epsg != 4326:
gdf.to_crs(4326, inplace=True)
geojson = json.loads(gdf.to_json())
for r in geojson['features']:
if 'properties' in r.keys():
properties = r['properties']
else:
properties = None
popup = folium.GeoJsonPopup([c for c in properties.keys() if geom_col not in c.lower()]) if properties is not None else None
geo_j = folium.GeoJson(data=r,
style_function=lambda x: {'fillColor': fillcolor},
popup=popup
)
geo_j.add_to(m)
return m
Import & Join#
NY State Census Block Groups#
retrieved from census.gov
ny_cb = 'cb_2017_36_bg_500k'
%%bash -s "$ny_cb"
unzip -o "../data/$1.zip" -d "../data/tmp/$1"
gdalsrsinfo -e "../data/tmp/$1/$1.shp" | grep "^EPSG"
Archive: ../data/cb_2017_36_bg_500k.zip
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.ea.iso.xml
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.iso.xml
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.xml
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shx
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.dbf
inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.prj
extracting: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.cpg
EPSG:4269
%%bash -s "$ny_cb"
ogr2ogr -f "PostgreSQL" \
PG:"host='0.0.0.0' port='25432' user='docker' password='docker' dbname='restaurants'" "../data/tmp/$1/$1.shp" \
-nlt PROMOTE_TO_MULTI \
-nln "census_block_ny" \
-lco GEOMETRY_NAME=geom \
-a_srs "EPSG:4269" \
-overwrite
American Community Survey Population#
acs_zip = '../data/Queens_Education_Attainment_ACS_17_5YR_B15003.zip'
acs = '../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003'
%%bash -s "$acs_zip" "$acs"
unzip -o $1 -d $2
Archive: ../data/Queens_Education_Attainment_ACS_17_5YR_B15003.zip
inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003.csv
inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003_metadata.csv
inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003.txt
inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/aff_download_readme.txt
Let’s save a little time and import the data via pandas rather than manually creating a table schema and using a COPY statement.
We are only interested in these Population aggregates from the metadata:
HD01_VD01,Estimate; Total:
HD01_VD22,Estimate; Total: - Bachelor's degree
HD01_VD23,Estimate; Total: - Master's degree
HD01_VD24,Estimate; Total: - Professional school degree
HD01_VD25,Estimate; Total: - Doctorate degree
acs_df = pd.read_csv(
f"{acs}/ACS_17_5YR_B15003.csv",
usecols=[
"GEO.id",
"GEO.id2",
"GEO.display-label",
"HD01_VD01",
"HD01_VD22",
"HD01_VD23",
"HD01_VD24",
"HD01_VD25",
],
)
# Give friendly names
acs_df.rename(
columns={
"GEO.id": "geo_id_1",
"GEO.id2": "geo_id_2",
"GEO.display-label": "name",
"HD01_VD01": "pop_total",
"HD01_VD22": "pop_bachelors",
"HD01_VD23": "pop_masters",
"HD01_VD24": "pop_professional_school",
"HD01_VD25": "pop_doctorate",
},
inplace=True,
)
with engine.connect() as con:
acs_df.to_sql('queens_acs', con, if_exists='replace', index=False)
%sql SELECT geo_id_1 FROM queens_acs LIMIT 5;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
| geo_id_1 |
|---|
| 1500000US360810001001 |
| 1500000US360810002001 |
| 1500000US360810002002 |
| 1500000US360810004001 |
| 1500000US360810004002 |
%sql SELECT affgeoid FROM census_block_ny LIMIT 5;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
| affgeoid |
|---|
| 1500000US360550116014 |
| 1500000US360810273005 |
| 1500000US360470336001 |
| 1500000US360470355001 |
| 1500000US361219707003 |
%%sql
DROP TABLE IF EXISTS queens_census_pop;
CREATE TABLE queens_census_pop AS
SELECT geo_id_1 as geoid,
queens_acs.name,
pop_total,
pop_bachelors,
pop_masters,
pop_professional_school,
pop_doctorate,
geom
FROM queens_acs
LEFT JOIN census_block_ny
ON queens_acs.geo_id_1 = census_block_ny.affgeoid
WHERE geom IS NOT NULL;
CREATE INDEX IF NOT EXISTS queens_census_pop_geom_idx ON queens_census_pop USING gist(geom);
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
1739 rows affected.
Done.
[]
%%sql queens_cb_pop <<
SELECT * FROM queens_census_pop;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1739 rows affected.
Returning data to local variable queens_cb_pop
Click on polygons to view data
to_map(queens_cb_pop)
Add geography columns to facilitate proximity analysis via ST_Buffer
%%sql
-- nyc_subway_stations
ALTER TABLE nyc_subway_stations
ADD COLUMN IF NOT EXISTS geog geography;
UPDATE nyc_subway_stations
SET geog = ST_Transform(geom, 4326)::geography;
CREATE INDEX IF NOT EXISTS nyc_subway_stations_geog_idx ON nyc_subway_stations USING gist(geog);
-- queens_census_pop
ALTER TABLE queens_census_pop
ADD COLUMN IF NOT EXISTS geog geography;
UPDATE queens_census_pop
SET geog = ST_Transform(geom, 4326)::geography;
CREATE INDEX IF NOT EXISTS queens_census_pop_geog_idx ON queens_census_pop USING gist(geog);
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
473 rows affected.
Done.
Done.
1739 rows affected.
Done.
[]
Spatial Analysis#
Find the total population, population with a bachelor’s degree or higher, and the percentage of those with a bachelor’s degree or higher within 200 and 500 meters from every subway station in Queens.
We will make use of of the ST_Intersection and ST_Buffer to calculate a “proportion of population” assuming the population is equally distributed within the census blocks.
Let’s just add those buffers directly to the nyc_subway_stations table
%%sql
ALTER TABLE nyc_subway_stations
ADD COLUMN IF NOT EXISTS buffer_500 geography;
ALTER TABLE nyc_subway_stations
ADD COLUMN IF NOT EXISTS buffer_200 geography;
UPDATE nyc_subway_stations
SET buffer_500 = ST_Buffer(geog, 500);
UPDATE nyc_subway_stations
SET buffer_200 = ST_Buffer(geog, 200);
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
Done.
473 rows affected.
473 rows affected.
[]
%%sql subway_buffer_pop <<
WITH subq1 AS (
SELECT
nss.ogc_fid,
nss.name,
nss.line,
qcp.geoid,
qcp.pop_total,
qcp.pop_bachelors + qcp.pop_masters + qcp.pop_professional_school + qcp.pop_doctorate as pop_bachelors_plus,
ST_Area(qcp.geog) as cb_area_total,
ST_Area(ST_Intersection(qcp.geog, nss.buffer_500)) as cb_area_in_buffer500,
ST_Area(ST_Intersection(qcp.geog, nss.buffer_200)) as cb_area_in_buffer200,
nss.geom
FROM queens_census_pop qcp
JOIN nyc_subway_stations nss
ON ST_INTERSECTS(qcp.geog, nss.buffer_500)
),
subq2 AS(
SELECT
ogc_fid,
name,
line,
pop_total * (cb_area_in_buffer500 / cb_area_total) as pop_total_in_buffer500,
pop_total * (cb_area_in_buffer200 / cb_area_total) as pop_total_in_buffer200,
pop_bachelors_plus * (cb_area_in_buffer500 / cb_area_total) as pop_bachelors_plus_in_buffer500,
pop_bachelors_plus * (cb_area_in_buffer200 / cb_area_total) as pop_bachelors_plus_in_buffer200,
geom
FROM subq1
)
SELECT
ogc_fid,
name,
line,
SUM(pop_total_in_buffer500)::int as pop_total_in_buffer500,
SUM(pop_total_in_buffer200)::int as pop_total_in_buffer200,
SUM(pop_bachelors_plus_in_buffer500)::int as pop_bachelors_plus_in_buffer500,
SUM(pop_bachelors_plus_in_buffer200)::int as pop_bachelors_plus_in_buffer200,
geom
FROM subq2
GROUP BY ogc_fid, name, line, geom
ORDER BY pop_bachelors_plus_in_buffer200 DESC;
* postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
90 rows affected.
Returning data to local variable subway_buffer_pop
subway_buffer_pop.DataFrame()
| ogc_fid | name | line | pop_total_in_buffer500 | pop_total_in_buffer200 | pop_bachelors_plus_in_buffer500 | pop_bachelors_plus_in_buffer200 | geom | |
|---|---|---|---|---|---|---|---|---|
| 0 | 164 | 75th Ave | E-F | 10108 | 2848 | 7206 | 2068 | 0104000020E6100000010000000101000000439B652890... |
| 1 | 67 | 67th Ave | E-M-R | 19616 | 3407 | 11493 | 2065 | 0104000020E61000000100000001010000001E15244495... |
| 2 | 37 | Elmhurst Ave | E-M-R | 21058 | 5619 | 5906 | 1802 | 0104000020E61000000100000001010000008DA1DD4173... |
| 3 | 79 | Broadway | N-W | 16821 | 2647 | 9448 | 1550 | 0104000020E610000001000000010100000074DC1BAF40... |
| 4 | 294 | 40th St | 7 | 13959 | 3167 | 6179 | 1440 | 0104000020E6100000010000000101000000359FFF1323... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 85 | 192 | Grant Ave | A-S | 1940 | 0 | 349 | 0 | 0104000020E6100000010000000101000000BD89ABFA5C... |
| 86 | 23 | Mets - Willets Point | 7-7 Express | 0 | 0 | 0 | 0 | 0104000020E6100000010000000101000000A33850B81E... |
| 87 | 305 | Jefferson St | L | 471 | 0 | 151 | 0 | 0104000020E6100000010000000101000000FC0BB00111... |
| 88 | 352 | Roosevelt Island - Main St | F | 1 | 0 | 1 | 0 | 0104000020E6100000010000000101000000AC5F5FCD01... |
| 89 | 228 | Cypress Hills | J | 107 | 0 | 25 | 0 | 0104000020E610000001000000010100000087F6F381E4... |
90 rows × 8 columns
Apparently you can find the most educated people within 200 meters of a Queens subway station at 75th Ave!